%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import display, display_markdown
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import subprocess as sp
import numpy as np
import pandas as pd
import seaborn as sns
import arviz as az
import bambi
import copy
import warnings
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 300
from scipy.stats import pearsonr, spearmanr
from itertools import chain
from collections import Counter
from subs2vec.utensils import log_timer
from subs2vec.vecs import Vectors
from subs2vec.neighbors import compute_nn
def display_md(md, **kwargs):
    """Show the string `md` as raw markdown in the notebook output.

    Any extra keyword arguments are forwarded unchanged to `display_markdown`,
    and whatever that call returns is returned to the caller.
    """
    rendered = display_markdown(md, raw=True, **kwargs)
    return rendered
def convert_notebook(title, output='html'):
    """Convert `{title}.ipynb` with `jupyter nbconvert` and report the outcome.

    Parameters:
        title: notebook name without the `.ipynb` extension.
        output: target format passed to `--to`; also used as the extension of
            the output file name (`{title}.{output}`).

    The result is reported through `display_md`; nothing is returned and a
    failed conversion does not raise.
    """
    # Build the argument list explicitly instead of splitting a formatted
    # string on spaces, so a title containing spaces stays a single argument.
    command = [
        'jupyter', 'nbconvert', f'{title}.ipynb',
        '--to', output,
        '--output', f'{title}.{output}',
    ]
    convert = sp.run(command)
    if convert.returncode == 0:
        display_md(f'Jupyter notebook `{title}` converted successfully.')
    else:
        display_md(f'Error: encountered problem converting Jupyter notebook `{title}`')
def download(fname):
    """Fetch `fname` by running `wget` on it and report the outcome.

    Parameters:
        fname: the single location handed to `wget`.

    The result is reported through `display_md`; nothing is returned and a
    failed download does not raise.
    """
    # Pass the location as one argv entry; splitting a formatted string on
    # spaces would break any location that itself contains a space.
    dl = sp.run(['wget', fname])
    if dl.returncode == 0:
        display_md(f'Download of `{fname}` successful.')
    else:
        display_md(f'Download of `{fname}` failed.')
@log_timer
def filter_vecs(vecs, filter_words):
    """Return a deep copy of `vecs` restricted to the words in `filter_words`.

    The input object is not modified. On the copy, `vectors` and `words` are
    reduced to the entries whose word occurs in `filter_words`, and `n` is set
    to the number of words left. The before/after counts are shown via
    `display_md`.
    """
    subset = copy.deepcopy(vecs)
    # one membership mask, applied to both the vector rows and the word list
    keep = np.isin(subset.words, filter_words)
    subset.vectors = subset.vectors[keep]
    subset.words = subset.words[keep]
    subset.n = len(subset.words)
    display_md(f'Filtered {vecs.n} vectors, {subset.n} remaining.')
    return subset
def norm(x):
    """Divide `x` by its 2-norm as computed by `np.linalg.norm(x, 2)`."""
    length = np.linalg.norm(x, 2)
    return x / length
# seaborn theme used for the plots in this notebook
sns.set(style='whitegrid')
# switch off pandas' chained-assignment warning
pd.options.mode.chained_assignment = None
# load the tab-separated ratings of the original study (one column per color) and show it
df = pd.read_csv('data/saysani_data.tsv', sep='\t')
display(df)
| participant | white | red | orange | yellow | green | blue | purple | brown | black | dimension | group | pp_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 7 | 7 | 5 | 1 | 1 | 1 | 3 | 7 | cold-hot | sighted | sighted_1 |
| 1 | 1 | 7 | 1 | 4 | 2 | 3 | 3 | 6 | 6 | 7 | ripe-unripe | sighted | sighted_1 |
| 2 | 1 | 1 | 5 | 6 | 7 | 4 | 2 | 3 | 7 | 6 | new-old | sighted | sighted_1 |
| 3 | 1 | 1 | 7 | 2 | 1 | 4 | 2 | 3 | 5 | 7 | submissive-aggressive | sighted | sighted_1 |
| 4 | 1 | 1 | 7 | 6 | 1 | 2 | 2 | 5 | 3 | 5 | selfless-jealous | sighted | sighted_1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 539 | 12 | 1 | 2 | 1 | 2 | 2 | 3 | 2 | 3 | 4 | soft-hard | blind | blind_12 |
| 540 | 12 | 4 | 3 | 3 | 4 | 2 | 2 | 3 | 2 | 5 | light-heavy | blind | blind_12 |
| 541 | 12 | 2 | 4 | 2 | 1 | 2 | 1 | 2 | 3 | 2 | relaxed-tense | blind | blind_12 |
| 542 | 12 | 4 | 2 | 1 | 1 | 1 | 3 | 2 | 3 | 5 | alive-dead | blind | blind_12 |
| 543 | 12 | 6 | 7 | 4 | 3 | 4 | 4 | 1 | 2 | 5 | fast-slow | blind | blind_12 |
544 rows × 13 columns
# the color terms that were rated; each is a column in the wide-format data
colors = ['white', 'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'black']

# reshape to long format: one row per (participant, dimension, color) rating
df_orig = pd.melt(
    df,
    id_vars=['group', 'dimension', 'pp_id'],
    value_vars=colors,
    var_name='color',
    value_name='rating',
)

# split every 'word1-word2' dimension label into its two words,
# then flatten the pairs into a single list of dimension words
dimension_labels = df_orig['dimension'].unique()
dimension_pairs = [label.split('-') for label in dimension_labels]
dimensions = list(chain.from_iterable(dimension_pairs))

# tag these rows with experiment and self vs. other, so they can be told apart
# once the replication experiments are added later
df_orig = df_orig.assign(experiment='original', self_vs_other='self')
display(df_orig)
| group | dimension | pp_id | color | rating | experiment | self_vs_other | |
|---|---|---|---|---|---|---|---|
| 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self |
| 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self |
| 2 | sighted | new-old | sighted_1 | white | 1 | original | self |
| 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self |
| 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 4891 | blind | soft-hard | blind_12 | black | 4 | original | self |
| 4892 | blind | light-heavy | blind_12 | black | 5 | original | self |
| 4893 | blind | relaxed-tense | blind_12 | black | 2 | original | self |
| 4894 | blind | alive-dead | blind_12 | black | 5 | original | self |
| 4895 | blind | fast-slow | blind_12 | black | 5 | original | self |
4896 rows × 7 columns
df_rep = pd.read_csv('data/replication1_data.csv')

# data munging: remove the test participant (pp_id 3) and all catch trials,
# then discard the two columns that are not needed afterwards
df_rep = df_rep[~((df_rep['pp_id'] == 3) | (df_rep['question_type'] == 'catch'))]
df_rep = df_rep.drop(['question_type', 'prompt_pre_1'], axis=1)

# long format: the own rating ('value') and the rating attributed to others
# ('others_choice') become separate rows, distinguished by 'self_vs_other'
df_rep = pd.melt(
    df_rep,
    id_vars=['dimension', 'color', 'pp_id'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# bring identifiers and labels in line with the original dataset
df_rep['pp_id'] = 'sighted_' + df_rep['pp_id'].astype(str)
df_rep['self_vs_other'] = df_rep['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_rep = df_rep.assign(group='sighted', experiment='replication_1')

# one dimension label is garbled in the raw file ('like-dis...like'); restore it
df_rep['dimension'] = df_rep['dimension'].replace({'like-dis...like': 'like-dislike'})
display(df_rep)
| dimension | color | pp_id | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|
| 0 | clean-dirty | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 1 | soft-hard | yellow | sighted_69819 | self | 2 | sighted | replication_1 |
| 2 | ripe-unripe | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| 3 | selfless-jealous | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 4 | high-low | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9567 | like-dislike | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9568 | new-old | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9569 | clean-dirty | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9570 | relaxed-tense | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9571 | active-passive | orange | sighted_69785 | other | 3 | sighted | replication_1 |
9572 rows × 7 columns
# load the second replication (ratings plus reading questionnaire columns),
# dropping the two columns 'Unnamed: 0' and 'X'
df_read = pd.read_csv('data/replication2_data_with_reading.csv').drop(columns=['Unnamed: 0', 'X'])
display(df_read)
| dimension | group | subj_id | color | value | question_type | others_choice | art | fiction | nonfiction | ... | Q9_17 | Q9_18 | Q9_19 | Q9_20 | Q9_21 | composite_read | upper_art | upper_fiction | upper_nonfiction | upper_read_motivation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | replication-sighted | 69212 | brown | 4 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1 | ripe-unripe | replication-sighted | 69212 | brown | 7 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 2 | new-old | replication-sighted | 69212 | brown | 6 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 3 | submissive-aggressive | replication-sighted | 69212 | brown | 2 | semantic_diff | 2 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 4 | selfless-jealous | replication-sighted | 69212 | brown | 5 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14251 | light-heavy | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14252 | relaxed-tense | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14253 | alive-dead | replication-sighted | 68129 | red | 7 | semantic_diff | 6 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14254 | fast-slow | replication-sighted | 68129 | red | 1 | semantic_diff | 3 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14255 | high-low | replication-sighted | 68129 | red | 1 | semantic_diff | 2 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
14256 rows × 36 columns
def _questionnaire_mean(frame, items, reverse_scored=()):
    """Average the `Q9_<item>` columns of `frame`, one value per row.

    Parameters:
        frame: DataFrame holding the `Q9_*` questionnaire columns.
        items: item numbers to include, summed in the given order.
        reverse_scored: item numbers whose answers are multiplied by -1
            before they are added.

    Returns a Series aligned with `frame`: the signed sum of the selected
    items divided by the number of items. A missing answer in any selected
    item makes that row's score NaN.
    """
    total = 0
    for item in items:
        answers = frame[f'Q9_{item}']
        total = total + (answers * -1 if item in reverse_scored else answers)
    return total / len(items)


# Scores are computed with whole-column arithmetic rather than a row-wise
# DataFrame.apply: same items, same order, same signs, but vectorised.
# Items 14 and 17 enter with a flipped sign wherever they are used.

# overall score across all 21 items
df_read['reading_motivation'] = _questionnaire_mean(df_read, range(1, 22), reverse_scored=(14, 17))
# subscale scores
df_read['reading_part_of_self'] = _questionnaire_mean(df_read, (2, 3, 4, 5, 6, 9, 10, 11))
df_read['reading_efficacy'] = _questionnaire_mean(df_read, (1, 14, 16, 17, 19, 20), reverse_scored=(14, 17))
df_read['reading_recognition'] = _questionnaire_mean(df_read, (12, 13, 15))
df_read['reading_other_realms'] = _questionnaire_mean(df_read, (7, 8, 18, 21))

# rename participant id column to match earlier datasets
df_read = df_read.rename(columns={'subj_id': 'pp_id'})

# melt to long format: own rating ('value') and rating attributed to others
# ('others_choice') become separate rows; the reading measures are carried along
df_read = df_read.melt(
    id_vars=['dimension', 'color', 'pp_id', 'art', 'fiction', 'nonfiction', 'reading_motivation',
             'reading_part_of_self', 'reading_efficacy', 'reading_recognition', 'reading_other_realms'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# more data munging: align identifiers and labels with the earlier datasets
df_read['pp_id'] = 'sighted_' + df_read['pp_id'].astype(str)
df_read['self_vs_other'] = df_read['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_read['group'] = 'sighted'
df_read['experiment'] = 'replication_2'
display(df_read)
| dimension | color | pp_id | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 4 | sighted | replication_2 |
| 1 | ripe-unripe | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 7 | sighted | replication_2 |
| 2 | new-old | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 6 | sighted | replication_2 |
| 3 | submissive-aggressive | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 2 | sighted | replication_2 |
| 4 | selfless-jealous | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 5 | sighted | replication_2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28507 | light-heavy | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28508 | relaxed-tense | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28509 | alive-dead | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 6 | sighted | replication_2 |
| 28510 | fast-slow | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 3 | sighted | replication_2 |
| 28511 | high-low | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 2 | sighted | replication_2 |
28512 rows × 15 columns
# summary statistics of the numeric columns (reading measures and ratings)
df_read.describe()
| art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | rating | |
|---|---|---|---|---|---|---|---|---|---|
| count | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 28512.000000 |
| mean | 7.616279 | 0.593023 | 0.755814 | -0.107973 | -0.280523 | 0.203488 | -0.616279 | 0.151163 | 3.693147 |
| std | 6.612596 | 0.854251 | 0.987569 | 0.646210 | 0.879559 | 0.646210 | 0.910077 | 0.759909 | 1.424941 |
| min | -5.000000 | 0.000000 | 0.000000 | -1.619048 | -2.000000 | -1.000000 | -2.000000 | -2.000000 | 1.000000 |
| 25% | 3.000000 | 0.000000 | 0.000000 | -0.571429 | -1.000000 | -0.333333 | -1.333333 | -0.500000 | 3.000000 |
| 50% | 6.000000 | 0.000000 | 0.000000 | -0.119048 | -0.375000 | 0.166667 | -0.666667 | 0.250000 | 4.000000 |
| 75% | 10.000000 | 1.000000 | 1.000000 | 0.285714 | 0.250000 | 0.666667 | 0.000000 | 0.500000 | 5.000000 |
| max | 26.000000 | 4.000000 | 4.000000 | 1.619048 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 7.000000 |
# pairwise correlations between the reading-related measures, rounded for annotation
corrs = df_read[[
    'art',
    'fiction',
    'nonfiction',
    'reading_motivation',
    'reading_part_of_self',
    'reading_efficacy',
    'reading_recognition',
    'reading_other_realms',
]].corr().round(2)

# mask the diagonal and the upper triangle so only the lower triangle is drawn
mask = np.zeros_like(corrs)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True

g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True, mask=mask)
# keep the row labels horizontal; the trailing semicolon suppresses the cell's text output
g.set_yticklabels(g.get_yticklabels(), rotation=0);
# histogram of the 'art' column
# NOTE(review): df_read is in long format here, so this counts rows rather than participants — confirm that is intended
g = sns.histplot(x='art', data=df_read)
# load the word embeddings (file not included in git repo)
# NOTE(review): n and d presumably are the number of vectors to read and their dimensionality — confirm against subs2vec
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
# keep only the vectors for the color words and the dimension words
vecs = filter_vecs(vecs, np.array(colors + dimensions))
vecs_dict = vecs.as_dict()
# separate vector sets for the color words and for the dimension words
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))
# one row per dimension pair: the vector of the first word minus the vector of the second word,
# rescaled by norm()
dimension_pair_vecs = np.vstack([norm(vecs_dict[pair[0]] - vecs_dict[pair[1]]) for pair in dimension_pairs])
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.337 seconds
Filtered 200000 vectors, 43 remaining.
[INFO] <function filter_vecs at 0x16cc52830> ran in 0.266 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds
Filtered 43 vectors, 9 remaining.
[INFO] <function filter_vecs at 0x16cc52830> ran in 0.001 seconds
Filtered 43 vectors, 34 remaining.
[INFO] <function filter_vecs at 0x16cc52830> ran in 0.001 seconds
# neighbors among the nine color vectors for every single dimension word
dimension_neighbors = compute_nn(
    color_vecs,
    dimension_vecs.vectors,
    dimension_vecs.words,
    num_neighbors=9,
    whole_matrix=True,
)
# drop the columns 'neighbor -1' through 'neighbor -9'
dimension_neighbors = dimension_neighbors.drop(
    columns=[f'neighbor -{rank}' for rank in range(1, 10)]
)
# the query word column is called 'target'; label it as the dimension
dimension_neighbors = dimension_neighbors.rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | like | white | black | yellow | orange | green | purple | blue | brown | red |
| 1 | old | white | brown | black | yellow | orange | green | blue | red | purple |
| 2 | new | black | white | green | yellow | red | blue | purple | orange | brown |
| 3 | light | yellow | orange | blue | green | red | purple | white | brown | black |
| 4 | hard | white | brown | black | red | orange | green | yellow | purple | blue |
| 5 | dead | black | white | brown | red | green | purple | yellow | orange | blue |
| 6 | cold | blue | black | white | green | brown | red | yellow | purple | orange |
| 7 | happy | white | orange | brown | yellow | purple | red | green | blue | black |
| 8 | hot | red | yellow | black | orange | purple | white | blue | green | brown |
| 9 | heavy | black | purple | red | blue | brown | green | yellow | orange | white |
| 10 | fast | red | white | yellow | black | blue | green | orange | brown | purple |
| 11 | soft | brown | green | yellow | purple | orange | red | white | blue | black |
| 12 | clean | white | blue | black | brown | yellow | green | red | orange | purple |
| 13 | slow | red | yellow | purple | blue | brown | black | green | orange | white |
| 14 | angry | orange | red | purple | black | white | yellow | blue | brown | green |
| 15 | alive | green | brown | orange | red | yellow | black | blue | purple | white |
| 16 | sad | brown | red | green | black | purple | yellow | blue | orange | white |
| 17 | fresh | green | red | white | blue | yellow | brown | black | purple | orange |
| 18 | calm | blue | green | white | brown | purple | red | black | yellow | orange |
| 19 | dirty | brown | yellow | blue | white | red | black | orange | green | purple |
| 20 | dull | brown | green | red | blue | yellow | orange | purple | black | white |
| 21 | relaxed | blue | white | green | yellow | red | purple | brown | orange | black |
| 22 | jealous | purple | red | black | orange | white | yellow | green | blue | brown |
| 23 | tense | white | black | blue | red | brown | green | orange | yellow | purple |
| 24 | exciting | green | orange | purple | blue | black | red | white | brown | yellow |
| 25 | active | orange | black | green | white | brown | red | blue | purple | yellow |
| 26 | ripe | orange | green | purple | yellow | red | brown | blue | black | white |
| 27 | aggressive | orange | yellow | black | white | brown | red | green | blue | purple |
| 28 | stale | brown | orange | yellow | white | green | red | purple | blue | black |
| 29 | dislike | purple | brown | black | orange | green | yellow | red | white | blue |
| 30 | passive | black | white | blue | green | red | brown | purple | orange | yellow |
| 31 | selfless | black | white | brown | blue | orange | purple | red | green | yellow |
| 32 | submissive | white | brown | black | purple | green | blue | orange | yellow | red |
| 33 | unripe | orange | purple | yellow | red | brown | green | black | blue | white |
# neighbors among the nine color vectors for every dimension pair,
# queried with the pair's difference vector and labelled with the pair's name
dimension_neighbors = compute_nn(
    color_vecs,
    dimension_pair_vecs,
    np.array(dimension_labels),
    num_neighbors=9,
    whole_matrix=True,
)
# drop the columns 'neighbor -1' through 'neighbor -9'
dimension_neighbors = dimension_neighbors.drop(
    columns=[f'neighbor -{rank}' for rank in range(1, 10)]
)
# the query column is called 'target'; label it as the dimension
dimension_neighbors = dimension_neighbors.rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | blue | green | brown | white | black | purple | yellow | red | orange |
| 1 | ripe-unripe | green | orange | purple | red | brown | yellow | blue | black | white |
| 2 | new-old | green | red | purple | black | yellow | blue | white | orange | brown |
| 3 | submissive-aggressive | purple | white | brown | black | blue | green | red | yellow | orange |
| 4 | selfless-jealous | brown | white | black | blue | orange | green | yellow | purple | red |
| 5 | active-passive | orange | green | brown | red | yellow | purple | black | blue | white |
| 6 | like-dislike | white | black | yellow | orange | blue | green | red | brown | purple |
| 7 | clean-dirty | white | blue | green | black | orange | purple | red | yellow | brown |
| 8 | fresh-stale | green | blue | red | white | black | purple | yellow | orange | brown |
| 9 | calm-angry | blue | green | brown | white | yellow | black | purple | red | orange |
| 10 | happy-sad | white | orange | yellow | purple | blue | green | red | black | brown |
| 11 | exciting-dull | orange | white | purple | black | blue | red | green | yellow | brown |
| 12 | soft-hard | green | purple | yellow | blue | orange | brown | red | white | black |
| 13 | light-heavy | yellow | orange | blue | white | green | red | purple | brown | black |
| 14 | relaxed-tense | purple | yellow | orange | blue | green | white | brown | red | black |
| 15 | alive-dead | green | orange | yellow | blue | red | brown | purple | white | black |
| 16 | fast-slow | white | black | green | orange | red | blue | yellow | brown | purple |
# stack the original study and both replications into one long table;
# columns that exist only in the second replication (the reading measures) are NaN for the other rows
# NOTE(review): reset_index() without drop=True keeps each source frame's old row number as an 'index' column — confirm it is wanted
df_joint = pd.concat([df_orig, df_rep, df_read]).reset_index()
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
42980 rows × 16 columns
# unigram frequency table (file not included in git repo)
freqs = pd.read_csv('../datasets/dedup.en.words.unigrams.tsv', sep='\t')  # not included in git repo
# work with the natural log of the frequencies and discard the raw values
freqs['log_freq'] = np.log(freqs['unigram_freq'])
freqs = freqs.drop(columns='unigram_freq')
display(freqs.round(2))
| unigram | log_freq | |
|---|---|---|
| 0 | the | 17.10 |
| 1 | you | 17.06 |
| 2 | i | 17.04 |
| 3 | to | 16.78 |
| 4 | a | 16.59 |
| ... | ... | ... |
| 2397976 | tpar1 | 0.00 |
| 2397977 | giacoia | 0.00 |
| 2397978 | ourcinders | 0.00 |
| 2397979 | tourret | 0.00 |
| 2397980 | iroki | 0.00 |
2397981 rows × 2 columns
# split every dimension label once; the part before the hyphen is word1, the part after it word2
split_labels = [label.split('-') for label in df_joint['dimension']]
df_joint['word1'] = [parts[0] for parts in split_labels]
df_joint['word2'] = [parts[1] for parts in split_labels]

# look up the log frequency of word1, then of word2; the second merge gives the
# clashing lookup columns their '_x' (word1) and '_y' (word2) suffixes
for pole in ('word1', 'word2'):
    df_joint = df_joint.merge(freqs, left_on=pole, right_on='unigram', how='left')

# frequency contrast of the pair: log frequency of word1 minus log frequency of word2
df_joint['frequency'] = df_joint['log_freq_x'] - df_joint['log_freq_y']

# remove the helper columns left over from the two merges
df_joint = df_joint.drop(columns=['unigram_x', 'unigram_y', 'log_freq_x', 'log_freq_y'])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 |
42980 rows × 19 columns
# word-level concreteness table with columns 'word' and 'concreteness' (file not included in git repo)
concreteness = pd.read_csv('../datasets/en-brysbaert-2014.tsv', sep='\t')  # not included in git repo
display(concreteness)
| word | concreteness | |
|---|---|---|
| 0 | a | 1.46 |
| 1 | aardvark | 4.68 |
| 2 | aback | 1.65 |
| 3 | abacus | 4.52 |
| 4 | abandon | 2.54 |
| ... | ... | ... |
| 37053 | zoologist | 4.30 |
| 37054 | zoology | 3.37 |
| 37055 | zoom | 3.10 |
| 37056 | zoophobia | 2.04 |
| 37057 | zucchini | 4.87 |
37058 rows × 2 columns
# look up the concreteness of word1, then of word2; the second merge gives the
# clashing lookup columns their '_x' (word1) and '_y' (word2) suffixes
for pole in ('word1', 'word2'):
    df_joint = df_joint.merge(concreteness, left_on=pole, right_on='word', how='left')

# concreteness contrast of the pair: concreteness of word1 minus concreteness of word2
df_joint['concreteness'] = df_joint['concreteness_x'] - df_joint['concreteness_y']

# remove the helper columns left over from the two merges
df_joint = df_joint.drop(columns=['word_x', 'word_y', 'concreteness_x', 'concreteness_y'])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | concreteness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 | -0.46 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 | -0.01 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 | 0.09 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 | -0.82 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 | -0.56 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 | 0.84 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 | 0.15 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 | -0.93 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 | 0.04 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 | 0.12 |
42980 rows × 20 columns
# word association data: one row per cue presentation, with up to three responses R1-R3
# (file not included in git repo)
swow = pd.read_csv('../datasets/SWOW-EN.R100.csv')  # not included in git repo
display(swow)
| Unnamed: 0 | id | participantID | age | gender | nativeLanguage | country | education | created_at | cue | R1 | R2 | R3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 29 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | although | nevertheless | yet | but |
| 1 | 2 | 30 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | deal | no | cards | shake |
| 2 | 3 | 31 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | music | notes | band | rhythm |
| 3 | 4 | 32 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | inform | tell | rat on | NaN |
| 4 | 5 | 33 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | way | path | via | method |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1228195 | 1228196 | 1530300 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | strange | mask | weird | stranger |
| 1228196 | 1228197 | 1530290 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | sunset | sea | sky | clause |
| 1228197 | 1228198 | 1530291 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | useless | pitty | worthless | worth |
| 1228198 | 1228199 | 1530284 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | volume | loud | music | key |
| 1228199 | 1228200 | 1530288 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | whenever | who | where | always |
1228200 rows × 13 columns
def add_swow(df, swow, colname):
    """Add a word-association difference score to `df` as column `colname`.

    For every row of `df`, count how often `color` was given as a response
    (`resp`) to the cue `word1` and to the cue `word2` in `swow`, and store
    the difference (count for word1 minus count for word2). Cue/response
    pairs that never occur count as 0.

    Parameters
    ----------
    df : DataFrame with columns `word1`, `word2` and `color`.
    swow : DataFrame with columns `cue` and `resp` (one row per response).
    colname : name of the column that receives the difference score.

    Returns
    -------
    A new DataFrame; the `df` passed in is not modified.
    """
    # Count responses per (cue, resp) pair. groupby().size() names the count
    # column explicitly, so this does not depend on how value_counts() names
    # its result (which differs between pandas versions).
    counts = swow.groupby(['cue', 'resp']).size().reset_index(name='n')

    # The first merge adds cue/resp/n for word1; the second merge adds the same
    # columns for word2, so pandas suffixes them with _x (word1) and _y (word2).
    df = df.merge(counts, left_on=['word1', 'color'], right_on=['cue', 'resp'], how='left')
    df = df.merge(counts, left_on=['word2', 'color'], right_on=['cue', 'resp'], how='left')

    # Pairs without any matching response are NaN after the left merge.
    df[colname] = df['n_x'].fillna(0) - df['n_y'].fillna(0)

    # Remove the helper columns introduced by the two merges.
    return df.drop(columns=['cue_x', 'cue_y', 'resp_x', 'resp_y', 'n_x', 'n_y'])
# Keep only the association cues that are also dimension words.
swow = swow[swow['cue'].isin(dimensions)]

# Country-specific subsets of respondents.
swow_NZ = swow[swow['country'] == 'New Zealand']
swow_US = swow[swow['country'] == 'United States']

# count only R1 (maximal discounting): all respondents, then NZ, then US
for _suffix, _subset in (('', swow), ('_NZ', swow_NZ), ('_US', swow_US)):
    df_joint = add_swow(df_joint, _subset.rename(columns={'R1': 'resp'}), 'swow_R1' + _suffix)

# count R1, R2, and R3 with equal weight (minimal discounting):
# stack the three response columns into one long `resp` column
_long_format = {
    'id_vars': ['id', 'participantID', 'created_at', 'cue'],
    'value_vars': ['R1', 'R2', 'R3'],
    'value_name': 'resp',
}
swow_all = swow.melt(**_long_format)
swow_all_NZ = swow_NZ.melt(**_long_format)  # NZ
swow_all_US = swow_US.melt(**_long_format)  # US
for _colname, _responses in (
    ('swow_all', swow_all),
    ('swow_all_NZ', swow_all_NZ),
    ('swow_all_US', swow_all_US),
):
    df_joint = add_swow(df_joint, _responses, _colname)

display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | cold | hot | -0.216432 | -0.46 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | ripe | unripe | 3.485549 | -0.01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | new | old | 0.119068 | 0.09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | submissive | aggressive | -2.352148 | -0.82 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | relaxed | tense | -0.229652 | 0.15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | alive | dead | -0.904786 | -0.93 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | fast | slow | 0.763262 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | high | low | 1.237676 | 0.12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
42980 rows × 26 columns
# check how many participants gave green as a response to various cues (to use as an example in the paper)
_by_cue_and_response = swow_all_US.groupby(['cue', 'resp'])
counts = _by_cue_and_response.count().reset_index()
_is_green = counts['resp'] == 'green'
display(counts.loc[_is_green])
| cue | resp | id | participantID | created_at | variable | |
|---|---|---|---|---|---|---|
| 233 | alive | green | 1 | 1 | 1 | 1 |
| 508 | clean | green | 1 | 1 | 1 | 1 |
| 1108 | exciting | green | 1 | 1 | 1 | 1 |
| 1289 | fresh | green | 1 | 1 | 1 | 1 |
| 1456 | hard | green | 1 | 1 | 1 | 1 |
| 1706 | jealous | green | 20 | 20 | 20 | 20 |
| 1984 | new | green | 1 | 1 | 1 | 1 |
| 3010 | unripe | green | 18 | 18 | 18 | 18 |
# Order rows by the all-responses association difference so both extremes are visible.
display(df_joint.sort_values(by='swow_all'))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25920 | 11452 | sighted | selfless-jealous | sighted_68676 | green | 2 | replication_2 | self | 4.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 5138 | 242 | sighted | selfless-jealous | sighted_68736 | green | 2 | replication_1 | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 37530 | 23062 | sighted | selfless-jealous | sighted_67653 | green | 6 | replication_2 | other | 10.0 | 4.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 25416 | 10948 | sighted | selfless-jealous | sighted_69192 | green | 7 | replication_2 | self | 9.0 | 1.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 16956 | 2488 | sighted | selfless-jealous | sighted_68719 | green | 5 | replication_2 | self | 3.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 466 | 466 | blind | clean-dirty | blind_8 | white | 2 | original | self | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 14129 | 9233 | sighted | clean-dirty | sighted_68738 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 33057 | 18589 | sighted | light-heavy | sighted_68150 | white | 2 | replication_2 | other | 9.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
| 12790 | 7894 | sighted | clean-dirty | sighted_68946 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 21537 | 7069 | sighted | light-heavy | sighted_67884 | white | 2 | replication_2 | self | 5.0 | 1.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
42980 rows × 26 columns
(It looks like there are very few responses from NZ, but somewhat more from the US and elsewhere.)
def get_cosine(x, vecs_dict):
    """Project a color vector onto a dimension axis.

    The axis is the `word2` vector minus the `word1` vector, rescaled with
    `norm`; the result is its dot product with the `color` vector. Words
    missing from `vecs_dict` are replaced by a 300-dimensional zero vector.

    x : row (or mapping) with keys 'word1', 'word2' and 'color'.
    vecs_dict : mapping from word to vector.
    """
    missing = np.zeros(300)
    axis = vecs_dict.get(x['word2'], missing) - vecs_dict.get(x['word1'], missing)
    color_vec = vecs_dict.get(x['color'], missing)
    return np.dot(norm(axis), color_vec)
# Cosine predictor from the cc.en.300 embeddings (file not included in git repo).
vecs = Vectors('../embeddings/cc.en.300.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# Extra positional arguments in `args` are passed to get_cosine after the row.
df_joint['cosine_cc'] = df_joint.apply(get_cosine, axis=1, args=(vecs_dict,))
[INFO] loading vectors ../embeddings/cc.en.300.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.176 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.056 seconds
# Cosine predictor from the subs.en.1e6 embeddings (file not included in git repo).
vecs = Vectors('../embeddings/subs.en.1e6.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# Extra positional arguments in `args` are passed to get_cosine after the row.
df_joint['cosine_subs'] = df_joint.apply(get_cosine, axis=1, args=(vecs_dict,))
[INFO] loading vectors ../embeddings/subs.en.1e6.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.236 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.043 seconds
# One cosine predictor per genre-specific embedding file (files not included in git repo).
_genre_embeddings = (
    ('../embeddings/acad.en.vec', 'cosine_acad'),  # academic
    ('../embeddings/fic.en.vec', 'cosine_fic'),    # fiction
    ('../embeddings/mag.en.vec', 'cosine_mag'),    # magazines
    ('../embeddings/spok.en.vec', 'cosine_spok'),  # spoken
    ('../embeddings/news.en.vec', 'cosine_news'),  # news
)
for _path, _column in _genre_embeddings:
    vecs = Vectors(_path, n=2e5, d=300, normalize=True)
    vecs_dict = vecs.as_dict()
    df_joint[_column] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

display(df_joint.round(2))
[INFO] loading vectors ../embeddings/acad.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.205 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.044 seconds [INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.082 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.041 seconds [INFO] loading vectors ../embeddings/mag.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 5.965 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.040 seconds [INFO] loading vectors ../embeddings/spok.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.355 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.063 seconds [INFO] loading vectors ../embeddings/news.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.311 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.039 seconds
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | swow_all | swow_all_NZ | swow_all_US | cosine_cc | cosine_subs | cosine_acad | cosine_fic | cosine_mag | cosine_spok | cosine_news | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | 0.02 | 0.04 | 0.01 | -0.06 | 0.03 | -0.04 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.02 | 0.09 | 0.03 | 0.16 | 0.07 | -0.20 | 0.03 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.12 | 0.04 | 0.10 | 0.07 | 0.04 | 0.08 | 0.03 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | -0.08 | -0.08 | -0.09 | -0.01 | -0.07 | -0.05 | -0.03 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | -0.01 | -0.01 | -0.01 | -0.01 | 0.11 | 0.01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.16 | -0.04 | -0.06 | -0.08 | -0.16 | -0.18 | -0.08 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.11 | -0.04 | 0.09 | -0.02 | -0.01 | 0.04 | 0.06 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | -1.0 | 0.0 | 0.0 | 0.15 | 0.06 | 0.02 | 0.05 | -0.00 | 0.05 | 0.10 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.00 | -0.02 | -0.01 | -0.00 | -0.05 | 0.04 | -0.03 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.01 | 0.04 | 0.01 | -0.04 | 0.06 | 0.12 | 0.08 |
42980 rows × 33 columns
COCA embeddings, but trained on COCA corpora from which all sentences containing first-order co-occurrences (i.e., sentences containing both a color word and a dimension word) have been removed.
# Fiction embeddings trained without first-order co-occurrences (file not included in git repo).
vecs = Vectors('../embeddings/fic_no_1st_order.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# Extra positional arguments in `args` are passed to get_cosine after the row.
df_joint['cosine_fic_no_1st_order'] = df_joint.apply(get_cosine, axis=1, args=(vecs_dict,))
[INFO] loading vectors ../embeddings/fic_no_1st_order.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.625 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.049 seconds
COCA embeddings, but from training corpora from which the 25 nearest neighbors of each color and dimension word have been removed (in an attempt to disrupt the "scaffolding" that semantic associations with the colors and dimension words are built on).
We use two filtering regimes, a strong and a weak one. In the strong regime we remove every line that contains any neighbor word. In the weak regime we remove any of the following:
# Strong and weak neighbor-filtering regimes (embedding files not included in git repo).
_filtered_embeddings = (
    ('../embeddings/fic_no_neighbors_strong_no1st.en.vec', 'cosine_fic_no_neighbors_strong'),
    ('../embeddings/fic_no_neighbors_weak.en.vec', 'cosine_fic_no_neighbors_weak'),
)
for _path, _column in _filtered_embeddings:
    vecs = Vectors(_path, n=2e5, d=300, normalize=True)
    vecs_dict = vecs.as_dict()
    _cosines = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
    # Missing cosine values are replaced by 0.
    df_joint[_column] = _cosines.fillna(0)
[INFO] loading vectors ../embeddings/fic_no_neighbors_strong_no1st.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 4.816 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.031 seconds [INFO] loading vectors ../embeddings/fic_no_neighbors_weak.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.479 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.043 seconds
COCA embeddings, but from training corpora from which the labels generated by at least two participants for color-semantic associations (e.g. the label snow for the combination white and cold) have been removed. (These nameability data are explored in more detail in a section at the end of this notebook.)
# fiction, trained without the mediating labels (file not included in git repo)
vecs = Vectors('../embeddings/fic_no_mediators.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
_cosines = df_joint.apply(get_cosine, axis=1, args=(vecs_dict,))
# Missing cosine values are replaced by 0.
df_joint['cosine_fic_no_mediators'] = _cosines.fillna(0)
[INFO] loading vectors ../embeddings/fic_no_mediators.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 5.644 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.038 seconds /var/folders/py/jxr8v1g13rs0lgpdf083hc7c0000gn/T/ipykernel_20191/653768335.py:57: RuntimeWarning: invalid value encountered in true_divide return x / np.linalg.norm(x, 2)
# Absolute pairwise correlations between ratings and predictors,
# restricted to the original experiment.
df_orig = df_joint[df_joint['experiment'] == 'original']
# Use the filtered frame here; correlating df_joint would ignore the
# experiment filter and reproduce the all-data heatmap.
corrs = np.abs(df_orig[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# Absolute pairwise correlations between ratings and predictors,
# restricted to the second replication.
df_rep = df_joint[df_joint['experiment'] == 'replication_2']
# Use the filtered frame here; correlating df_joint would ignore the
# experiment filter and reproduce the all-data heatmap.
corrs = np.abs(df_rep[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# Absolute pairwise correlations between ratings and predictors over all rows of df_joint.
_heatmap_columns = [
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]
_pairwise = df_joint[_heatmap_columns].corr()
corrs = _pairwise.abs().round(2)
g = sns.heatmap(corrs, annot=True, vmin=0, vmax=1)
def standardize(Series):
    """Return `Series` as z-scores: subtract its mean, then divide by its `.std()`."""
    centered = Series - Series.mean()
    return centered / Series.std()
# z-score every continuous variable into a new `<name>_z` column (order fixes the column order)
_columns_to_standardize = [
    'art',
    'fiction',
    'nonfiction',
    'reading_motivation',
    'reading_part_of_self',
    'reading_efficacy',
    'reading_recognition',
    'reading_other_realms',
    'rating',
    'frequency',
    'concreteness',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'swow_R1',
    'swow_R1_NZ',
    'swow_R1_US',
    'cosine_cc',
    'cosine_subs',
    'cosine_acad',
    'cosine_fic',
    'cosine_mag',
    'cosine_news',
    'cosine_spok',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
]
for _column in _columns_to_standardize:
    df_joint[_column + '_z'] = standardize(df_joint[_column])

# group: dummy columns, effect coding (-1 / +1) and a standardized version
_group_dummies = pd.get_dummies(df_joint['group'])
df_joint['blind'] = _group_dummies['blind']
df_joint['sighted'] = _group_dummies['sighted']
df_joint['group_eff'] = (df_joint['sighted'] - .5) * 2
df_joint['group_z'] = standardize(df_joint['sighted'])

# experiment: one dummy column per experiment
_experiment_dummies = pd.get_dummies(df_joint['experiment'])
for _level in ('original', 'replication_1', 'replication_2'):
    df_joint[_level] = _experiment_dummies[_level]

# self vs. other: dummy columns, effect coding (-1 / +1) and a standardized version
_rater_dummies = pd.get_dummies(df_joint['self_vs_other'])
df_joint['other'] = _rater_dummies['other']
df_joint['self'] = _rater_dummies['self']
df_joint['self_vs_other_eff'] = (df_joint['other'] - .5) * 2
df_joint['self_vs_other_z'] = standardize(df_joint['other'])

# save the full predictor table, then show it
df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t', index=False)
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | sighted | group_eff | group_z | original | replication_1 | replication_2 | other | self | self_vs_other_eff | self_vs_other_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
42980 rows × 76 columns
# Reload the saved predictor table so the analyses below can start from the file on disk.
df_joint = pd.read_csv(
    'data/data_plus_predictors.tsv',
    sep='\t',
)
def get_cosine_1word(x, vecs_dict):
    """Dot product of the `dimension` word vector and the `color` word vector.

    Words missing from `vecs_dict` are replaced by a 300-dimensional zero
    vector, so any pair with a missing word yields 0.

    x : row (or mapping) with keys 'dimension' and 'color'.
    vecs_dict : mapping from word to vector.
    """
    missing = np.zeros(300)
    dimension_vec = vecs_dict.get(x['dimension'], missing)
    color_vec = vecs_dict.get(x['color'], missing)
    return np.dot(dimension_vec, color_vec)
# nameability measures per color / dimension-word prompt
df_names = pd.read_csv('data/color_dimension_nameability.csv')

# fiction embeddings (file not included in git repo)
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()

display(df_names.head())
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x14494eca0> ran in 6.137 seconds [INFO] <function Vectors.as_dict at 0x14494eee0> ran in 0.044 seconds
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat |
# check how many participants provided labels for each color-adjective pair
_responses_per_prompt = df_names['number_responses']
print(_responses_per_prompt.min())
print(_responses_per_prompt.max())
# prompts ordered from lowest to highest modal agreement
display(df_names.sort_values(by='modal_agreement'))
7 13
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99 | liked_blue | liked | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... |
| 40 | relaxed_blue | relaxed | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | smurfette,meditation,bird,water,tranquility,st... | 0.076923 | smurfette,meditation,bird,water,tranquility,st... |
| 30 | submissive_blue | submissive | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | macaw,nun,bird,swallow,butterfly,flower,door,b... | 0.076923 | macaw,nun,bird,swallow,butterfly,flowers,door,... |
| 91 | old_blue | old | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | bluecheese,necklace,bird,dress,shoe,smurfs,rug... | 0.076923 | bluecheese,necklace,bird,dress,shoes,smurfs,ru... |
| 192 | clean_yellow | clean | yellow | 12 | 1.083333 | 1.000000 | 1.000000 | 0.000000 | 0.083333 | table,detergant,sun,glove,hat,flag,ford,mustan... | 0.083333 | table,detergant,sun,gloves,hat,flag,ford.musta... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 151 | clean_white | clean | white | 9 | 1.111111 | 0.700000 | 0.600000 | 0.222222 | 0.555556 | sheet | 0.333333 | sheets |
| 170 | ripe_yellow | ripe | yellow | 12 | 1.000000 | 0.500000 | 0.500000 | 0.318182 | 0.583333 | banana | 0.583333 | banana |
| 68 | cold_blue | cold | blue | 13 | 1.000000 | 0.461538 | 0.461538 | 0.358974 | 0.615385 | ice | 0.615385 | ice |
| 147 | cold_white | cold | white | 9 | 1.000000 | 0.333333 | 0.333333 | 0.583333 | 0.777778 | snow | 0.777778 | snow |
| 158 | stale_white | stale | white | 9 | 1.000000 | 0.222222 | 0.222222 | 0.777778 | 0.888889 | bread | 0.888889 | bread |
306 rows × 12 columns
# Flatten the comma-separated modal names of every prompt into one list of labels.
names = df_names['modal_names']
names = list(chain.from_iterable(name.split(',') for name in names))
names_all = set(names)  # all unique names
names_count = Counter(names)
# all names that occur 2+ times
names_2plus = [name for name, count in names_count.most_common() if count >= 2]
# NOTE(review): the count is the number of times a label appears across the
# prompts' modal_names lists; confirm this matches "at least 2 participants".
print(f'Number of labels named by at least 2 participants: {len(names_2plus)}')

# One label per line. The set is sorted so the file is identical between
# runs (set iteration order of strings is not stable across sessions).
with open('data/pair_labels_all.txt', 'w', encoding='utf-8') as namesfile:
    namesfile.write('\n'.join(sorted(names_all)))
# write() rather than writelines(): the joined text is a single string.
with open('data/pair_labels_2plus.txt', 'w', encoding='utf-8') as namesfile:
    namesfile.write('\n'.join(names_2plus))
# let's ignore words like "me", "my", and "a" though
Number of labels named by at least 2 participants: 242
Since we only have nameability for colors and dimension axis poles (i.e. for yellow and dislike but not yellow and dislike-like), we correlate nameability measures with cosine similarity between color and dimension axis pole.
# How strongly do the two nameability measures correlate with each other?
pearsonr(
    df_names['simpson_diversity'],
    df_names['modal_agreement'],
)
PearsonRResult(statistic=0.8947743710654124, pvalue=1.816739746708339e-108)
# Similarity between each color and dimension pole word in the fiction embeddings.
df_names['cosine_fic'] = df_names.apply(get_cosine_1word, axis=1, args=(vecs_dict,))
display(df_names.head())

_nameability_measures = ('simpson_diversity', 'modal_agreement')

# correlate the cosine with each nameability measure
for _measure in _nameability_measures:
    x = pearsonr(df_names['cosine_fic'], df_names[_measure])
    print(f'pearsonr(cosine_fiction, {_measure}): {x[0]:.3f}, p-value: {x[1]:.3f}')

# one regression plot per nameability measure
for _measure in _nameability_measures:
    g = sns.lmplot(x='cosine_fic', y=_measure, data=df_names)
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | cosine_fic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy | 0.142680 |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.326845 |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood | 0.193040 |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark | 0.160328 |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat | 0.274516 |
pearsonr(cosine_fiction, simpson_diversity): 0.185, p-value: 0.001 pearsonr(cosine_fiction, modal_agreement): 0.203, p-value: 0.000
Since we do not have human ratings for the association between colors and dimension axis poles (only for association between colors and dimension axes), we need to collapse our nameability measures for the two poles of each dimension axis. One way to do this is to compute difference scores.
# Restrict to sighted participants; copy so the new columns go to an independent frame.
df_sighted = df_joint.loc[df_joint['group'] == 'sighted'].copy()


def _pole_nameability(pole, measure):
    """Return `measure` from df_names for each (pole word, color) pair, in df_sighted row order."""
    merged = df_sighted[[pole, 'color']].merge(
        df_names[['dimension', 'color', measure]],
        how='left',
        left_on=[pole, 'color'],
        right_on=['dimension', 'color'],
        validate='many_to_one',  # one nameability row per (dimension, color) keeps row count unchanged
    )
    # merge() returns a fresh 0..n-1 index, while df_sighted keeps the row
    # labels of df_joint. Returning a plain array makes the assignment
    # positional; assigning the merged Series would align on labels and
    # attach values to the wrong rows.
    return merged[measure].to_numpy()


df_sighted['diversity_word1'] = _pole_nameability('word1', 'simpson_diversity')
df_sighted['diversity_word2'] = _pole_nameability('word2', 'simpson_diversity')
df_sighted['agreement_word1'] = _pole_nameability('word1', 'modal_agreement')
df_sighted['agreement_word2'] = _pole_nameability('word2', 'modal_agreement')

# difference scores: first pole minus second pole
df_sighted['diff_diversity'] = (df_sighted['diversity_word1'] - df_sighted['diversity_word2'])
df_sighted['diff_agreement'] = (df_sighted['agreement_word1'] - df_sighted['agreement_word2'])

# dropna() removes rows with a missing value in ANY column, not only the nameability columns
df_sighted = df_sighted.dropna()
display(df_sighted.head())

# mean and standard deviation per color / dimension combination
df_mean_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).mean().reset_index()
df_sd_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).std().reset_index()

x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | other | self | self_vs_other_eff | self_vs_other_z | diversity_word1 | diversity_word2 | agreement_word1 | agreement_word2 | diff_diversity | diff_agreement | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14468 | 0 | sighted | cold-hot | sighted_69212 | brown | 4 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.012821 | 0.142857 | 0.285714 | -0.012821 | -0.142857 |
| 14469 | 1 | sighted | ripe-unripe | sighted_69212 | brown | 7 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.238095 | 0.035714 | 0.428571 | 0.285714 | 0.202381 | 0.142857 |
| 14470 | 2 | sighted | new-old | sighted_69212 | brown | 6 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14471 | 3 | sighted | submissive-aggressive | sighted_69212 | brown | 2 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14472 | 4 | sighted | selfless-jealous | sighted_69212 | brown | 5 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.044444 | 0.142857 | 0.285714 | -0.044444 | -0.142857 |
5 rows × 82 columns
pearsonr(rating, simpson_diversity_difference): 0.036, p-value: 0.666 pearsonr(rating, modal_agreement_difference): -0.012, p-value: 0.890 pearsonr(cosine_fiction, simpson_diversity_difference): 0.091, p-value: 0.278 pearsonr(cosine_fiction, modal_agreement_difference): 0.032, p-value: 0.707
# Attach the per-combination standard deviation of the ratings to the table of means.
df_mean_sighted['rating_sd'] = df_sd_sighted['rating']
g = sns.lmplot(x='rating_sd', y='diff_diversity', data=df_mean_sighted)
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
# Second measure: modal agreement difference (this previously repeated the
# diversity correlation, so the same result was printed twice).
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001 pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
Another way to work around the issue of having nameability scores only for color-to-axis-pole pairs is to split and invert the human ratings of color-dimension axis associations, creating two scores per rating: one for the right end of the axis (equal to the rating), and one for the left end of the axis (equal to eight minus the rating). For example, if yellow is assigned a 6 on the scale dislike-like, the rating for yellow/like is 6, and we also create a rating of 2 for yellow/dislike.
# Build one row per (color, axis pole) instead of one row per (color, axis).
# Left pole (word1): association strength is the mirrored rating, 8 - rating.
left_pole = df_sighted[
    ['color', 'word1', 'rating', 'diversity_word1', 'agreement_word1']
].rename(columns={
    'word1': 'dimension',
    'diversity_word1': 'simpson_diversity',
    'agreement_word1': 'modal_agreement',
})
left_pole['rating'] = 8 - left_pole['rating']

# Right pole (word2): the rating is used as-is.
right_pole = df_sighted[
    ['color', 'word2', 'rating', 'diversity_word2', 'agreement_word2']
].rename(columns={
    'word2': 'dimension',
    'diversity_word2': 'simpson_diversity',
    'agreement_word2': 'modal_agreement',
})

# Left-pole rows first, then right-pole rows (original row labels are kept).
df_inverse = pd.concat([left_pole, right_pole])
display(df_inverse)

# Mean and standard deviation per color/pole pair; both frames come from the
# same grouping, so their rows line up after reset_index().
by_color_pole = df_inverse.groupby(['color', 'dimension'])
df_mean_inverse = by_color_pole.mean().reset_index()
df_sd_inverse = by_color_pole.std().reset_index()

# Correlate the mean pole rating with each nameability measure.
mean_rating = df_mean_inverse['rating']
x = pearsonr(mean_rating, df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(mean_rating, df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
| color | dimension | rating | simpson_diversity | modal_agreement | |
|---|---|---|---|---|---|
| 14468 | brown | cold | 4 | 0.000000 | 0.142857 |
| 14469 | brown | ripe | 1 | 0.238095 | 0.428571 |
| 14470 | brown | new | 2 | 0.000000 | 0.142857 |
| 14471 | brown | submissive | 6 | 0.000000 | 0.142857 |
| 14472 | brown | selfless | 3 | 0.000000 | 0.142857 |
| ... | ... | ... | ... | ... | ... |
| 41138 | yellow | hard | 2 | 0.000000 | 0.125000 |
| 41139 | yellow | heavy | 2 | 0.000000 | 0.125000 |
| 41140 | yellow | tense | 2 | 0.000000 | 0.125000 |
| 41141 | yellow | dead | 2 | 0.000000 | 0.125000 |
| 41142 | yellow | slow | 2 | 0.000000 | 0.125000 |
46272 rows × 5 columns
pearsonr(rating, simpson_diversity): 0.062, p-value: 0.293 pearsonr(rating, modal_agreement): 0.070, p-value: 0.237
# Put the per-pair rating SD next to the per-pair means.
df_mean_inverse['rating_sd'] = df_sd_inverse['rating']

# One regression plot per nameability measure (modal agreement first).
g = sns.lmplot(data=df_mean_inverse, x='rating_sd', y='modal_agreement')
g = sns.lmplot(data=df_mean_inverse, x='rating_sd', y='simpson_diversity')

# Correlate rating variability with each nameability measure.
rating_spread = df_mean_inverse['rating_sd']
x = pearsonr(rating_spread, df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(rating_spread, df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.228, p-value: 0.000 pearsonr(rating_sd, modal_agreement): 0.228, p-value: 0.000
In short: nameability (measured as Simpson diversity and as name agreement for the modal name) is weakly correlated with cosine similarity between colors and dimension-axis poles, but not with human ratings, regardless of whether we fit the nameability to the ratings (by computing difference scores for the nameability measures) or fit the ratings to the nameability (by computing inverse ratings for the left poles of the dimension axes).
# Work on every dimension except 'high-low'.
df_viz = df_joint[df_joint['dimension'] != 'high-low']

# Mean of every numeric column per dimension/color pair. numeric_only=True
# keeps string columns (e.g. 'group') out of the aggregation; pandas >= 2.0
# raises on them instead of silently dropping them.
df_means = (
    df_viz
    .groupby(['dimension', 'color', 'word1', 'word2'])
    .mean(numeric_only=True)
    .reset_index()
)

# Order dimensions by how strongly the mean rating varies across colors
# (largest spread first). Only 'rating' is needed, so select it before std()
# rather than aggregating the string columns as well.
dim_order = (
    df_means
    .groupby('dimension')['rating']
    .std()
    .sort_values(ascending=False)
    .reset_index()['dimension']
)
df_means = df_means.set_index('dimension').loc[dim_order].reset_index()

# For each dimension keep the color(s) with the lowest / highest mean rating.
# String aliases replace the builtin min/max callables, which pandas has
# deprecated as transform arguments.
rating_by_dim = df_means.groupby(['dimension'])['rating']
mins_idx = rating_by_dim.transform('min') == df_means['rating']
mins = df_means[mins_idx]
maxs_idx = rating_by_dim.transform('max') == df_means['rating']
maxs = df_means[maxs_idx]

# Pull the individual ratings for those extreme color/dimension pairs back in.
# The minima keep word2 and the maxima keep word1 as their label column.
df_mins = mins[['word2', 'dimension', 'color']].merge(
    df_viz[['word2', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word2'])
df_maxs = maxs[['word1', 'dimension', 'color']].merge(
    df_viz[['word1', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word1'])
display(df_mins)
display(df_maxs)
sns.set_style('darkgrid')

# Every color name is used as its own plotting color. unique() avoids walking
# every row of df_viz to build a nine-or-so-entry dict (same keys, same order).
all_colors = {color: color for color in df_viz['color'].unique()}

fig, ax1 = plt.subplots(figsize=(3, 8))

# Left y-axis shows the word1 pole labels, the twin right y-axis the word2 labels.
# seaborn reads the number in errorbar=('ci', level) as a percentage, so the
# previous value of .95 drew a 0.95% interval (effectively no error bar).
sns.pointplot(data=df_viz, y='word1', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax1, errorbar=('ci', 95))
ax2 = ax1.twinx()
sns.pointplot(data=df_viz, y='word2', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax2, errorbar=('ci', 95))

# Strip axis titles and legends.
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7]);
sns.set_style('whitegrid')

# Color names double as plotting colors for the lowest- and highest-rated colors.
mins_colors = dict(zip(mins['color'], mins['color']))
maxs_colors = dict(zip(maxs['color'], maxs['color']))

fig, ax1 = plt.subplots(figsize=(3, 7))

# Highest-rated color per dimension, labelled with word1 on the left axis.
sns.violinplot(
    x='rating', y='word1', hue='color', data=df_maxs,
    palette=maxs_colors, ax=ax1, dodge=False, inner=None, cut=0,
)

# Lowest-rated color per dimension, labelled with word2 on the twin right axis.
ax2 = ax1.twinx()
sns.violinplot(
    x='rating', y='word2', hue='color', data=df_mins,
    palette=mins_colors, ax=ax2, dodge=False, inner=None, cut=0,
)

# Same cosmetic clean-up for both axes: translucent violins, no axis title,
# no legend.
for axis in (ax1, ax2):
    plt.setp(axis.collections, alpha=.8)
    axis.set(ylabel='')
    axis.get_legend().remove()

ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')
sns.set_style('darkgrid')

# Per-group means for every dimension/color pair. numeric_only=True keeps the
# remaining string columns out of the aggregation (pandas >= 2.0 raises on them).
# NOTE: this rebinds df_sighted (and df_blind) to tables of means; the
# inverse-rating cell earlier in the notebook also reads df_sighted, so
# re-running cells out of order will pick up this version instead.
group_keys = ['group', 'dimension', 'color', 'word1', 'word2']
df_blind = (
    df_viz[df_viz['group'] == 'blind']
    .groupby(group_keys).mean(numeric_only=True).reset_index()
)
df_sighted = (
    df_viz[df_viz['group'] == 'sighted']
    .groupby(group_keys).mean(numeric_only=True).reset_index()
)

df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')

# Color names double as plotting colors; unique() replaces a row-by-row
# iterrows() pass that produced the same mapping.
means_colors = {color: color for color in df_scatter['color'].unique()}

# col_order pins the facet order. Without it the facets follow the order in
# which groups appear in the sorted data, while the titles below are assigned
# by position -- so the panels could end up mislabelled.
g = sns.FacetGrid(df_scatter, hue='color', col='group', col_order=['blind', 'sighted'],
                  height=5, palette=means_colors, aspect=.5, sharex=True)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
# ci=None is the documented way to disable the confidence band; ci=False still
# ran the bootstrap and drew a zero-width band.
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=None)
g.set(xlabel='COCA-fiction\nembedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.75, 7.25], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
# Collapse to one mean per dimension/color pair for each group.
# numeric_only=True skips the leftover string columns (group, word1, word2),
# which pandas >= 2.0 refuses to average.
df_sighted_mean = df_sighted.groupby(['dimension', 'color']).mean(numeric_only=True).reset_index()
df_blind_mean = df_blind.groupby(['dimension', 'color']).mean(numeric_only=True).reset_index()

# Side-by-side table: one row per dimension/color pair with the sighted and the
# blind mean rating. Left join keeps every sighted pair even without a blind match.
df_ratings = df_sighted_mean[['dimension', 'color', 'rating']].merge(
    df_blind_mean[['dimension', 'color', 'rating']],
    on=['dimension', 'color'],
    how='left',
    suffixes=('_sighted', '_blind'),
)

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    x='rating_sighted',
    y='rating_blind',
    hue='color',
    palette=all_colors,
    legend=False,
    ax=ax,
    data=df_ratings
)
ax.set(ylabel='mean blind association rating', xlabel='mean sighted association rating',
       ylim=[1, 7], xlim=[1, 7]);
def annotate(df, color, dimension, x=0, y=0):
    """Write a '<color> on <dimension>' label next to one point of the current plot.

    The label is anchored at the 'rating_sighted' / 'rating_blind' values of the
    first row of ``df`` whose 'color' and 'dimension' match, shifted by ``x``
    and ``y``. Raises IndexError when no row matches.
    """
    selected = (df['color'] == color) & (df['dimension'] == dimension)
    x_pos = df.loc[selected, 'rating_sighted'].values[0] + x
    y_pos = df.loc[selected, 'rating_blind'].values[0] + y
    plt.text(x_pos, y_pos, f'{color} on {dimension}', fontdict={'size': 'small'})
# Hand-placed labels: (color, dimension, x offset, y offset). The offsets nudge
# each label away from its point.
label_offsets = [
    ('white', 'clean-dirty', -1.95, -.05),
    ('blue', 'cold-hot', .05, -.20),
    ('red', 'cold-hot', .1, -.05),
    ('orange', 'cold-hot', .05, +.05),
    ('black', 'cold-hot', -1.75, -.05),
    ('red', 'relaxed-tense', .1, -.05),
]
for color_name, dim_name, dx, dy in label_offsets:
    annotate(df_ratings, color_name, dim_name, dx, dy)
# To label every point instead, call annotate for each row of df_ratings.
plt.savefig('figures/blind_vs_sighted_scatter.pdf', bbox_inches='tight')
# Export this notebook (data_prep.ipynb) with the convert_notebook helper, which
# runs `jupyter nbconvert` and uses its default output format, HTML.
convert_notebook('data_prep')
[NbConvertApp] Converting notebook data_prep.ipynb to html [NbConvertApp] Writing 11273209 bytes to data_prep.html
Jupyter notebook data_prep converted successfully.